package irie.util;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;

/**
 * Scratch tool that extracts a comma-separated item list from a locally saved
 * HTML page.
 *
 * <p>Pages are read from {@code <PAGE_DIR>/<n>.txt}. The extraction is purely
 * positional (plain {@code indexOf}/{@code substring}); no HTML or regex
 * parsing is involved.
 */
public class TestRegex {
	/** Directory containing the saved pages, one {@code <n>.txt} file per page. */
	private static final String PAGE_DIR = "D:\\webmagic\\mm.10086.cn";

	/** Opening tag of the section the item list lives in. */
	private static final String SECTION_START = "<div class=\"mj_info font-f-yh\">";

	/** HTML comment that marks the end of that section. */
	private static final String SECTION_END = "<!--含可选收费项目begin-->";

	/** Closing tags of the entry that precedes the first wanted {@code <li>}. */
	private static final String HEADER_END = "</span></li>";

	/**
	 * Number of characters skipped from the start of {@link #HEADER_END}: the
	 * marker itself (12 chars) plus one more character. Presumably that extra
	 * character is a line break in the saved pages - TODO confirm.
	 */
	private static final int HEADER_SKIP = 13;

	/**
	 * Reads the whole content of page file {@code n}.
	 *
	 * <p>I/O failures are reported (stack trace plus a message on stdout) and
	 * whatever was read up to that point is returned, which is the empty string
	 * when the file cannot be opened at all.
	 *
	 * @param n page number; the file read is {@code <PAGE_DIR>/<n>.txt}
	 * @return the file content, possibly empty or partial after an I/O error
	 */
	private static String readString(int n) {
		StringBuilder content = new StringBuilder();
		File file = new File(PAGE_DIR, n + ".txt");
		// NOTE(review): FileReader decodes with the platform default charset;
		// confirm this matches the encoding the pages were saved in.
		// try-with-resources closes the reader even when read() fails part-way.
		try (FileReader reader = new FileReader(file)) {
			char[] buffer = new char[8192];
			int count;
			while ((count = reader.read(buffer)) != -1) {
				content.append(buffer, 0, count);
			}
		} catch (IOException e) {
			e.printStackTrace();
			System.out.println("File reader出错");
		}
		return content.toString();
	}

	/**
	 * Cuts the item list out of a page and flattens it to comma-separated text.
	 *
	 * @param html full page content
	 * @return the flattened list, or {@code null} when any of the expected
	 *         markers is missing or out of order
	 */
	private static String extract(String html) {
		// 1. Narrow down to the section between the two page-level markers.
		int sectionStart = html.indexOf(SECTION_START);
		int sectionEnd = html.indexOf(SECTION_END);
		if (sectionStart < 0 || sectionEnd < sectionStart) {
			return null;
		}
		String section = html.substring(sectionStart, sectionEnd);

		// 2. Keep what lies between the header entry and the last </li>.
		int headerEnd = section.indexOf(HEADER_END);
		if (headerEnd < 0) {
			return null;
		}
		int itemsStart = headerEnd + HEADER_SKIP;
		int itemsEnd = section.lastIndexOf("</li>");
		// HEADER_END itself ends in </li>, so itemsEnd can fall before itemsStart
		// when no further list entry follows the header.
		if (itemsEnd < itemsStart) {
			return null;
		}
		String items = section.substring(itemsStart, itemsEnd);

		// 3. Drop everything from the first "<a" up to and including the last
		//    "\">", i.e. the anchor opening tag(s); the link text after it stays.
		int anchorStart = items.indexOf("<a");
		int anchorTagEnd = items.lastIndexOf("\">");
		if (anchorStart < 1 || anchorTagEnd < 0) {
			return null;
		}
		// The first character of the slice is skipped as well (index 1, not 0).
		String beforeAnchor = items.substring(1, anchorStart);
		String afterAnchor = items.substring(anchorTagEnd + 2);

		// 4. Strip spaces and leftover tags; each closing </li> becomes a comma.
		//    All patterns are literals, so replace() is used instead of the
		//    regex-based replaceAll().
		return (beforeAnchor + afterAnchor)
				.replace(" ", "")
				.replace("<li>", "")
				.replace("</li>", ",")
				.replace("</a>", "");
	}

	/**
	 * Extracts the item list from page {@code n}, prints it to stdout and
	 * returns it.
	 *
	 * @param n page number, see {@link #readString(int)}
	 * @return the comma-separated list; the empty string when the page could
	 *         not be read or does not contain the expected markers (a
	 *         diagnostic is written to stderr in that case)
	 */
	public String ex(int n){
		String extracted = extract(readString(n));
		if (extracted == null) {
			System.err.println("Expected markers not found in page " + n);
			return "";
		}
		System.out.println(extracted);
		return extracted;
	}

	/** Runs the extraction on page 2. */
	public static void main(String[] args) {
		TestRegex tr = new TestRegex();
		tr.ex(2);
	}

}
